library(DiceKriging)
mat2list <- function(X){
  
  # Turns the p columns of a matrix into a p-length list,
  # with each column becoming an element of the list.
  #
  # X : a matrix (anything indexable as X[, i] with an ncol())
  # Returns: an unnamed list of length ncol(X); an empty list when X
  # has no columns.
  
  # seq_len() is safe for zero-column input, where the original
  # 1:ncol(X) would iterate over c(1, 0) and fail on X[, 1].
  lapply(seq_len(ncol(X)), function(j) X[ , j])
  
}

# Can this go in common data? Would be needed for checking emulator fits
# Create fit lists for the combined data wave00 level 1a and wave01
#Y_const_level1a_wave01_scaled_list <- mat2list(Y_const_level1a_wave01_scaled)

#fit_list_const_level1a_wave01 <- mclapply(X = Y_const_level1a_wave01_scaled_list , FUN = km, formula = ~., design = X_level1a_wave01,
#                                   mc.cores = 4, control = list(trace = FALSE))


reset <- function() {
  # Restore the device to a single clean panel so that subsequent calls
  # can annotate anywhere in the figure region. Margins are zeroed and
  # new = TRUE overlays an invisible [0, 1] x [0, 1] canvas on top of
  # whatever is already drawn, without clearing the device.
  zero4 <- rep(0, 4)
  par(mfrow = c(1, 1), oma = zero4, mar = zero4, new = TRUE)
  plot(0:1, 0:1, type = "n", xlab = "", ylab = "", axes = FALSE)
}

makeTransparent <- function(someColor, alpha = 100)
  # Convert colour names / hex codes into semi-transparent versions of
  # themselves for plotting. alpha runs 0 (invisible) to 255 (opaque).
  # Returns a character vector of "#RRGGBBAA" strings, one per input.
{
  channels <- col2rgb(someColor)  # 3 x n matrix: red, green, blue rows
  # rgb() is vectorised over its channel arguments, so one call handles
  # every input colour at once.
  rgb(red = channels[1, ], green = channels[2, ], blue = channels[3, ],
      alpha = alpha, maxColorValue = 255)
}
# Load the JULES historical ensemble timeseries into the global
# environment (provides npp_ens_wave00 and the other *_ens_wave00
# matrices, plus X and years used below).
load('data/ensemble-jules-historical-timeseries.RData')
count_na <- function(x){
  
  # Count how many elements of x are missing (NA).
  length(which(is.na(x)))
}

# Sanity check: the NPP ensemble matrix should contain no missing values.
count_na(npp_ens_wave00)
## [1] 0
# Check every wave00 ensemble object currently in the workspace the
# same way; each should print 0 (no NAs anywhere).
ens_charvec <- ls(pattern = "ens_wave00")

for(i in ens_charvec){
  
  
  print(count_na(get(i)))
}
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0

# Split out training and test data ----

# How about we look at post 1950? Maybe that is messing it up?
# Keep only timeseries columns 101-164 (the later part of the record).
years_ix <- 101:164

# Train on wave00 members 1-399 (dropping member 288); hold out
# members 400-499 as an independent test set.
train_ix <- (1:399)[-288] # Number 288 has numerical problems for NPP
test_ix  <- 400:499

X_train <- X[train_ix, ]
X_test  <- X[test_ix, ]

Y_train <- npp_ens_wave00[train_ix, years_ix ]
Y_test  <- npp_ens_wave00[test_ix, years_ix]


years_trunc <- years[years_ix]

# Plot the training set ----

# One black line per training ensemble member, against year.
matplot(years_trunc, t(Y_train), type = 'l', lty = 'solid', col = 'black', xlab = 'years', ylab = 'training output', ylim = c(0,180))

## Perform PCA

# Reconstruct the training data with the truncated number of PCs - this is the
# smallest error we can expect with an emulation.

# perform a PCA on the training output
pca <- prcomp(Y_train, center = TRUE, scale = TRUE)

plot(pca)

# How many principal components do we wish to keep?
npc <- 3

scores <- pca$x[ ,1:npc]

# project the truncated scores back up, to see how well they reproduce the 
# original data
  
# Rotate the truncated scores back into data space, then undo the
# scaling and centering that prcomp applied.
anom <- pca$rotation[ ,1:npc] %*% t(scores)*pca$scale
tens <- t(anom + pca$center)
matplot(years_trunc, t(Y_train), type = 'l', lty = 'solid', col = 'black', xlab = 'years', ylab = 'training output', ylim = c(0,180))
matlines(years_trunc, t(tens),  lty = 'solid', col = 'red', xlab = 'years', ylab = 'training output')

# Plot the reconstruction error

Y_train_err <- tens - Y_train

matplot(years_trunc, t(Y_train_err), 
        type = 'l',
        lty = 'solid',
        col = 'black',
        xlab = 'years',
        ylab = 'training output',
        main = 'training reconstruction error - perfect scores',
        ylim = c(-5,5)
        )

# Build an emulator for each of the PCs ----

# Fit one km Gaussian-process emulator per retained PC score, and
# collect leave-one-out (training) and held-out (test) predictions.
scores_em_mean_test <- NULL
scores_em_sd_test <- NULL

scores_em_mean_train <- NULL

# NOTE(review): growing matrices with cbind() inside a loop is an
# anti-pattern in general, but with npc = 3 iterations it is harmless.
for(i in 1:npc){
  #
  y <- pca$x[,i]
  fit <- km(~., design = X_train, response = y)
  # Leave-one-out predictions for the training set (universal kriging).
  loo <- leaveOneOut.km(fit, type = 'UK', trend.reestim = TRUE)
  
  
  pred_test <- predict(fit, newdata = X_test, type = 'UK')
  
  scores_em_mean <- pred_test$mean
  scores_em_sd <- pred_test$sd
  
  scores_em_mean_test <- cbind(scores_em_mean_test, scores_em_mean)
  scores_em_sd_test   <- cbind(scores_em_sd_test, scores_em_sd)
  
  scores_em_mean_train <- cbind(scores_em_mean_train, loo$mean)
  
}
## 
## optimisation start
## ------------------
## * estimation method   : MLE 
## * optimisation method : BFGS 
## * analytical gradient : used
## * trend model : ~alpha_io + a_wl_io + bio_hum_cn + b_wl_io + dcatch_dlai_io + 
##     dqcrit_io + dz0v_dh_io + f0_io + fd_io + g_area_io + g_root_io + 
##     g_wood_io + gs_nvg_io + hw_sw_io + kaps_roth + knl_io + lai_max_io + 
##     lai_min_io + lma_io + n_inorg_turnover + nmass_io + nr_io + 
##     retran_l_io + retran_r_io + r_grow_io + rootd_ft_io + sigl_io + 
##     sorp + tleaf_of_io + tlow_io + tupp_io + l_vg_soil
## * covariance model : 
##   - type :  matern5_2 
##   - nugget : NO
##   - parameters lower bounds :  1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 
##   - parameters upper bounds :  1.998344 2 1.994471 1.999927 2 2 2 1.996699 2 1.996789 2 2 2 2 1.999374 1.99598 1.998257 2 2 1.996693 1.998361 2 2 2 1.991674 2 2 1.986556 2 2 2 1.998477 
##   - best initial criterion value(s) :  -1283.624 
## 
## N = 32, M = 5 machine precision = 2.22045e-16
## At X0, 0 variables are exactly at the bounds
## At iterate     0  f=       1283.6  |proj g|=        1.911
## At iterate     1  f =         1269  |proj g|=        1.8736
## At iterate     2  f =       1240.6  |proj g|=        1.7741
## At iterate     3  f =       1236.1  |proj g|=        1.9164
## At iterate     4  f =       1226.2  |proj g|=        1.8193
## ys=-2.199e+00  -gs= 9.530e+00, BFGS update SKIPPED
## At iterate     5  f =       1219.8  |proj g|=        1.5641
## ys=-5.732e-01  -gs= 6.016e+00, BFGS update SKIPPED
## At iterate     6  f =         1210  |proj g|=        1.6505
## ys=-1.732e+00  -gs= 8.869e+00, BFGS update SKIPPED
## At iterate     7  f =       1203.1  |proj g|=        1.7473
## ys=-2.964e-02  -gs= 6.788e+00, BFGS update SKIPPED
## At iterate     8  f =       1197.6  |proj g|=        1.8377
## At iterate     9  f =       1193.3  |proj g|=        1.8289
## At iterate    10  f =       1192.6  |proj g|=        1.8052
## At iterate    11  f =       1192.1  |proj g|=        1.8215
## At iterate    12  f =       1191.9  |proj g|=        1.7826
## At iterate    13  f =       1191.5  |proj g|=        1.7646
## At iterate    14  f =       1190.2  |proj g|=        1.8236
## At iterate    15  f =       1189.8  |proj g|=        1.7157
## At iterate    16  f =       1189.7  |proj g|=        1.6587
## At iterate    17  f =       1188.9  |proj g|=        1.8242
## At iterate    18  f =       1188.6  |proj g|=        1.8197
## At iterate    19  f =       1188.5  |proj g|=        1.5593
## At iterate    20  f =       1188.3  |proj g|=        1.5372
## At iterate    21  f =       1188.1  |proj g|=        1.5102
## At iterate    22  f =         1188  |proj g|=        1.5038
## At iterate    23  f =       1187.8  |proj g|=        1.8205
## At iterate    24  f =       1186.3  |proj g|=        1.8429
## At iterate    25  f =       1184.9  |proj g|=        1.8193
## At iterate    26  f =         1184  |proj g|=        1.7606
## At iterate    27  f =       1182.3  |proj g|=        1.7419
## At iterate    28  f =         1182  |proj g|=        1.7423
## At iterate    29  f =       1181.8  |proj g|=         1.815
## At iterate    30  f =       1181.7  |proj g|=        1.8134
## At iterate    31  f =       1181.6  |proj g|=        1.8126
## At iterate    32  f =       1181.4  |proj g|=        1.7473
## At iterate    33  f =       1181.2  |proj g|=        1.7455
## At iterate    34  f =         1181  |proj g|=        0.8349
## At iterate    35  f =       1180.8  |proj g|=         1.805
## At iterate    36  f =       1180.8  |proj g|=       0.97958
## At iterate    37  f =       1180.8  |proj g|=       0.94645
## At iterate    38  f =       1180.8  |proj g|=       0.90469
## At iterate    39  f =       1180.8  |proj g|=       0.79804
## At iterate    40  f =       1180.7  |proj g|=       0.89866
## At iterate    41  f =       1180.7  |proj g|=        1.7465
## At iterate    42  f =       1180.7  |proj g|=        0.2816
## At iterate    43  f =       1180.7  |proj g|=       0.27781
## At iterate    44  f =       1180.7  |proj g|=       0.26052
## At iterate    45  f =       1180.7  |proj g|=       0.26042
## At iterate    46  f =       1180.7  |proj g|=       0.25903
## At iterate    47  f =       1180.7  |proj g|=        0.4029
## At iterate    48  f =       1180.7  |proj g|=       0.33381
## At iterate    49  f =       1180.7  |proj g|=       0.44174
## At iterate    50  f =       1180.7  |proj g|=        1.1685
## At iterate    51  f =       1180.7  |proj g|=       0.56017
## At iterate    52  f =       1180.7  |proj g|=       0.37387
## At iterate    53  f =       1180.7  |proj g|=       0.19494
## At iterate    54  f =       1180.7  |proj g|=       0.12967
## At iterate    55  f =       1180.7  |proj g|=       0.10408
## At iterate    56  f =       1180.7  |proj g|=       0.11899
## At iterate    57  f =       1180.7  |proj g|=       0.18916
## At iterate    58  f =       1180.7  |proj g|=       0.11524
## At iterate    59  f =       1180.7  |proj g|=       0.10869
## At iterate    60  f =       1180.7  |proj g|=       0.09803
## At iterate    61  f =       1180.7  |proj g|=       0.01086
## At iterate    62  f =       1180.7  |proj g|=      0.047212
## At iterate    63  f =       1180.7  |proj g|=      0.051041
## At iterate    64  f =       1180.7  |proj g|=        0.1066
## At iterate    65  f =       1180.7  |proj g|=      0.061185
## At iterate    66  f =       1180.7  |proj g|=       0.10953
## At iterate    67  f =       1180.7  |proj g|=      0.053443
## At iterate    68  f =       1180.7  |proj g|=     0.0087198
## At iterate    69  f =       1180.7  |proj g|=      0.016103
## At iterate    70  f =       1180.7  |proj g|=      0.019865
## 
## iterations 70
## function evaluations 87
## segments explored during Cauchy searches 89
## BFGS updates skipped 4
## active bounds at final generalized Cauchy point 26
## norm of the final projected gradient 0.0198651
## final function value 1180.69
## 
## F = 1180.69
## final  value 1180.686617 
## converged
## 
## optimisation start
## ------------------
## * estimation method   : MLE 
## * optimisation method : BFGS 
## * analytical gradient : used
## * trend model : ~alpha_io + a_wl_io + bio_hum_cn + b_wl_io + dcatch_dlai_io + 
##     dqcrit_io + dz0v_dh_io + f0_io + fd_io + g_area_io + g_root_io + 
##     g_wood_io + gs_nvg_io + hw_sw_io + kaps_roth + knl_io + lai_max_io + 
##     lai_min_io + lma_io + n_inorg_turnover + nmass_io + nr_io + 
##     retran_l_io + retran_r_io + r_grow_io + rootd_ft_io + sigl_io + 
##     sorp + tleaf_of_io + tlow_io + tupp_io + l_vg_soil
## * covariance model : 
##   - type :  matern5_2 
##   - nugget : NO
##   - parameters lower bounds :  1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 
##   - parameters upper bounds :  1.998344 2 1.994471 1.999927 2 2 2 1.996699 2 1.996789 2 2 2 2 1.999374 1.99598 1.998257 2 2 1.996693 1.998361 2 2 2 1.991674 2 2 1.986556 2 2 2 1.998477 
##   - best initial criterion value(s) :  167.712 
## 
## N = 32, M = 5 machine precision = 2.22045e-16
## At X0, 0 variables are exactly at the bounds
## At iterate     0  f=      -167.71  |proj g|=       1.6558
## At iterate     1  f =       -170.3  |proj g|=        1.9572
## At iterate     2  f =      -170.72  |proj g|=        1.9361
## At iterate     3  f =      -170.84  |proj g|=        1.6593
## At iterate     4  f =      -171.18  |proj g|=        1.6306
## At iterate     5  f =      -171.65  |proj g|=        1.8433
## At iterate     6  f =      -172.06  |proj g|=        1.7997
## At iterate     7  f =      -172.62  |proj g|=        1.9289
## At iterate     8  f =      -172.83  |proj g|=        1.2723
## At iterate     9  f =      -173.77  |proj g|=        1.9021
## At iterate    10  f =      -174.43  |proj g|=         1.871
## At iterate    11  f =      -174.92  |proj g|=        1.8544
## ys=-4.728e-02  -gs= 4.708e-01, BFGS update SKIPPED
## At iterate    12  f =      -177.77  |proj g|=        1.7932
## At iterate    13  f =      -181.31  |proj g|=        1.6854
## At iterate    14  f =      -187.68  |proj g|=        1.5381
## At iterate    15  f =      -192.96  |proj g|=        1.6002
## At iterate    16  f =      -194.38  |proj g|=        1.4773
## At iterate    17  f =      -195.63  |proj g|=        1.6936
## At iterate    18  f =      -196.47  |proj g|=        1.7131
## At iterate    19  f =      -196.95  |proj g|=        1.6404
## At iterate    20  f =      -197.67  |proj g|=        1.6986
## At iterate    21  f =      -198.48  |proj g|=        1.5499
## At iterate    22  f =       -198.8  |proj g|=        1.4856
## At iterate    23  f =      -199.91  |proj g|=        1.7139
## At iterate    24  f =      -201.15  |proj g|=        1.1927
## At iterate    25  f =      -201.58  |proj g|=        1.0805
## At iterate    26  f =      -201.92  |proj g|=        1.7084
## At iterate    27  f =       -202.2  |proj g|=        1.6869
## At iterate    28  f =      -202.34  |proj g|=         1.136
## At iterate    29  f =      -202.47  |proj g|=       0.84481
## At iterate    30  f =      -202.65  |proj g|=       0.99603
## At iterate    31  f =      -202.78  |proj g|=        1.6903
## At iterate    32  f =      -202.88  |proj g|=        1.6884
## At iterate    33  f =      -202.96  |proj g|=       0.60062
## At iterate    34  f =      -202.98  |proj g|=       0.40819
## At iterate    35  f =      -203.06  |proj g|=       0.83351
## At iterate    36  f =      -203.14  |proj g|=       0.93846
## At iterate    37  f =      -203.17  |proj g|=       0.83267
## At iterate    38  f =      -203.23  |proj g|=       0.79981
## At iterate    39  f =      -203.25  |proj g|=       0.65264
## At iterate    40  f =      -203.32  |proj g|=        0.4391
## At iterate    41  f =      -203.34  |proj g|=       0.45939
## At iterate    42  f =       -203.4  |proj g|=       0.54628
## At iterate    43  f =      -203.43  |proj g|=       0.92779
## At iterate    44  f =      -203.46  |proj g|=       0.65543
## At iterate    45  f =      -203.49  |proj g|=       0.54152
## At iterate    46  f =       -203.5  |proj g|=        0.1871
## At iterate    47  f =       -203.5  |proj g|=       0.19936
## At iterate    48  f =       -203.5  |proj g|=        0.3244
## At iterate    49  f =      -203.51  |proj g|=       0.32018
## At iterate    50  f =      -203.52  |proj g|=       0.33853
## At iterate    51  f =      -203.52  |proj g|=       0.16434
## At iterate    52  f =      -203.52  |proj g|=       0.28014
## At iterate    53  f =      -203.52  |proj g|=       0.32037
## At iterate    54  f =      -203.53  |proj g|=       0.55584
## At iterate    55  f =      -203.53  |proj g|=       0.25956
## At iterate    56  f =      -203.53  |proj g|=       0.14111
## At iterate    57  f =      -203.53  |proj g|=       0.11963
## At iterate    58  f =      -203.53  |proj g|=       0.15618
## At iterate    59  f =      -203.54  |proj g|=       0.24496
## At iterate    60  f =      -203.54  |proj g|=       0.11887
## At iterate    61  f =      -203.54  |proj g|=      0.072762
## At iterate    62  f =      -203.54  |proj g|=      0.029363
## At iterate    63  f =      -203.54  |proj g|=      0.047519
## At iterate    64  f =      -203.54  |proj g|=      0.039717
## At iterate    65  f =      -203.54  |proj g|=      0.023013
## At iterate    66  f =      -203.54  |proj g|=      0.022067
## At iterate    67  f =      -203.54  |proj g|=      0.019308
## At iterate    68  f =      -203.54  |proj g|=      0.014468
## At iterate    69  f =      -203.54  |proj g|=      0.030188
## At iterate    70  f =      -203.54  |proj g|=      0.015508
## At iterate    71  f =      -203.54  |proj g|=     0.0046804
## At iterate    72  f =      -203.54  |proj g|=     0.0041961
## At iterate    73  f =      -203.54  |proj g|=     0.0052956
## At iterate    74  f =      -203.54  |proj g|=     0.0084188
## At iterate    75  f =      -203.54  |proj g|=      0.006971
## At iterate    76  f =      -203.54  |proj g|=     0.0019931
## At iterate    77  f =      -203.54  |proj g|=     0.0018218
## 
## iterations 77
## function evaluations 84
## segments explored during Cauchy searches 88
## BFGS updates skipped 1
## active bounds at final generalized Cauchy point 19
## norm of the final projected gradient 0.00182179
## final function value -203.537
## 
## F = -203.537
## final  value -203.537366 
## converged
## 
## optimisation start
## ------------------
## * estimation method   : MLE 
## * optimisation method : BFGS 
## * analytical gradient : used
## * trend model : ~alpha_io + a_wl_io + bio_hum_cn + b_wl_io + dcatch_dlai_io + 
##     dqcrit_io + dz0v_dh_io + f0_io + fd_io + g_area_io + g_root_io + 
##     g_wood_io + gs_nvg_io + hw_sw_io + kaps_roth + knl_io + lai_max_io + 
##     lai_min_io + lma_io + n_inorg_turnover + nmass_io + nr_io + 
##     retran_l_io + retran_r_io + r_grow_io + rootd_ft_io + sigl_io + 
##     sorp + tleaf_of_io + tlow_io + tupp_io + l_vg_soil
## * covariance model : 
##   - type :  matern5_2 
##   - nugget : NO
##   - parameters lower bounds :  1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 
##   - parameters upper bounds :  1.998344 2 1.994471 1.999927 2 2 2 1.996699 2 1.996789 2 2 2 2 1.999374 1.99598 1.998257 2 2 1.996693 1.998361 2 2 2 1.991674 2 2 1.986556 2 2 2 1.998477 
##   - best initial criterion value(s) :  681.8778 
## 
## N = 32, M = 5 machine precision = 2.22045e-16
## At X0, 0 variables are exactly at the bounds
## At iterate     0  f=      -681.88  |proj g|=       1.8862
## At iterate     1  f =      -685.39  |proj g|=        1.6969
## At iterate     2  f =      -690.58  |proj g|=        1.7887
## At iterate     3  f =      -691.69  |proj g|=        1.8782
## At iterate     4  f =      -695.57  |proj g|=         1.474
## At iterate     5  f =      -696.51  |proj g|=        1.4301
## At iterate     6  f =      -699.81  |proj g|=        1.6632
## At iterate     7  f =      -703.66  |proj g|=        1.8304
## At iterate     8  f =      -705.51  |proj g|=        1.5102
## At iterate     9  f =      -706.54  |proj g|=        1.5512
## At iterate    10  f =      -708.23  |proj g|=        1.8447
## At iterate    11  f =       -708.9  |proj g|=        1.6452
## At iterate    12  f =      -709.09  |proj g|=         1.637
## At iterate    13  f =      -709.32  |proj g|=        1.6327
## At iterate    14  f =      -709.99  |proj g|=         1.822
## At iterate    15  f =      -710.27  |proj g|=        1.0794
## At iterate    16  f =      -710.64  |proj g|=        1.1016
## At iterate    17  f =      -710.82  |proj g|=        1.3344
## At iterate    18  f =      -711.56  |proj g|=        1.6335
## At iterate    19  f =      -711.68  |proj g|=        1.4616
## At iterate    20  f =      -711.81  |proj g|=         1.083
## At iterate    21  f =      -712.22  |proj g|=        1.0842
## At iterate    22  f =      -712.58  |proj g|=        1.1808
## At iterate    23  f =      -712.68  |proj g|=       0.99465
## At iterate    24  f =      -712.73  |proj g|=       0.74038
## At iterate    25  f =      -712.78  |proj g|=        1.8315
## At iterate    26  f =      -712.81  |proj g|=        1.8336
## At iterate    27  f =      -712.94  |proj g|=        1.8352
## At iterate    28  f =      -713.02  |proj g|=        1.8172
## At iterate    29  f =      -713.05  |proj g|=       0.66021
## At iterate    30  f =      -713.09  |proj g|=       0.43437
## At iterate    31  f =      -713.12  |proj g|=       0.44507
## At iterate    32  f =      -713.21  |proj g|=        1.7356
## At iterate    33  f =      -713.26  |proj g|=        1.3057
## At iterate    34  f =      -713.29  |proj g|=        1.4468
## At iterate    35  f =      -713.32  |proj g|=        1.4686
## At iterate    36  f =      -713.37  |proj g|=        1.4705
## At iterate    37  f =      -713.39  |proj g|=       0.37368
## At iterate    38  f =       -713.4  |proj g|=        0.2782
## At iterate    39  f =       -713.4  |proj g|=        1.0599
## At iterate    40  f =       -713.4  |proj g|=       0.19422
## At iterate    41  f =      -713.41  |proj g|=       0.19192
## At iterate    42  f =      -713.41  |proj g|=       0.46662
## At iterate    43  f =      -713.42  |proj g|=       0.58493
## At iterate    44  f =      -713.43  |proj g|=       0.45128
## At iterate    45  f =      -713.45  |proj g|=        1.6998
## At iterate    46  f =      -713.45  |proj g|=       0.58155
## At iterate    47  f =      -713.46  |proj g|=       0.54353
## At iterate    48  f =      -713.47  |proj g|=        0.2181
## At iterate    49  f =      -713.47  |proj g|=       0.23092
## At iterate    50  f =      -713.47  |proj g|=       0.39126
## At iterate    51  f =      -713.47  |proj g|=       0.54593
## At iterate    52  f =      -713.48  |proj g|=       0.43935
## At iterate    53  f =      -713.48  |proj g|=       0.68417
## At iterate    54  f =      -713.48  |proj g|=       0.15831
## At iterate    55  f =      -713.48  |proj g|=       0.11442
## At iterate    56  f =      -713.48  |proj g|=       0.16225
## At iterate    57  f =      -713.49  |proj g|=       0.16222
## At iterate    58  f =      -713.49  |proj g|=      0.093246
## At iterate    59  f =      -713.49  |proj g|=       0.16234
## At iterate    60  f =      -713.49  |proj g|=       0.11056
## At iterate    61  f =      -713.49  |proj g|=       0.13681
## At iterate    62  f =      -713.49  |proj g|=      0.086683
## At iterate    63  f =      -713.49  |proj g|=      0.071366
## At iterate    64  f =      -713.49  |proj g|=      0.073598
## At iterate    65  f =      -713.49  |proj g|=       0.37024
## At iterate    66  f =      -713.49  |proj g|=       0.37446
## At iterate    67  f =      -713.49  |proj g|=       0.25901
## At iterate    68  f =      -713.49  |proj g|=       0.43179
## At iterate    69  f =      -713.49  |proj g|=       0.16825
## At iterate    70  f =      -713.49  |proj g|=       0.16214
## At iterate    71  f =      -713.49  |proj g|=       0.16223
## At iterate    72  f =      -713.49  |proj g|=      0.097911
## At iterate    73  f =      -713.49  |proj g|=       0.06467
## At iterate    74  f =      -713.49  |proj g|=        0.1245
## At iterate    75  f =      -713.49  |proj g|=      0.075958
## At iterate    76  f =      -713.49  |proj g|=        0.0425
## At iterate    77  f =      -713.49  |proj g|=      0.041319
## At iterate    78  f =       -713.5  |proj g|=       0.10335
## At iterate    79  f =       -713.5  |proj g|=      0.092153
## At iterate    80  f =       -713.5  |proj g|=       0.36409
## At iterate    81  f =       -713.5  |proj g|=       0.11798
## At iterate    82  f =       -713.5  |proj g|=      0.034605
## At iterate    83  f =       -713.5  |proj g|=      0.021465
## At iterate    84  f =       -713.5  |proj g|=      0.024109
## At iterate    85  f =       -713.5  |proj g|=        0.0423
## At iterate    86  f =       -713.5  |proj g|=      0.046687
## At iterate    87  f =       -713.5  |proj g|=      0.024203
## At iterate    88  f =       -713.5  |proj g|=       0.16315
## At iterate    89  f =       -713.5  |proj g|=       0.10832
## At iterate    90  f =       -713.5  |proj g|=      0.073464
## At iterate    91  f =       -713.5  |proj g|=       0.10057
## At iterate    92  f =       -713.5  |proj g|=       0.10677
## At iterate    93  f =       -713.5  |proj g|=       0.10335
## At iterate    94  f =       -713.5  |proj g|=        0.2692
## At iterate    95  f =       -713.5  |proj g|=       0.18251
## At iterate    96  f =       -713.5  |proj g|=       0.16739
## At iterate    97  f =       -713.5  |proj g|=       0.25655
## At iterate    98  f =       -713.5  |proj g|=       0.28924
## At iterate    99  f =       -713.5  |proj g|=       0.39099
## At iterate   100  f =       -713.5  |proj g|=       0.18433
## At iterate   101  f =       -713.5  |proj g|=      0.078975
## final  value -713.501662 
## stopped after 101 iterations
# Reconstruct the training (leave-one-out) and test series from the
# emulated PC scores. Use all npc retained components: the loop above
# emulates npc (= 3) scores, so truncating the reconstruction at the
# first 2 (as the original code did) silently discarded one emulated
# component and was inconsistent with the chosen npc.
anom_loo <- pca$rotation[ ,1:npc] %*% t(scores_em_mean_train[,1:npc])*pca$scale
tens_loo <- t(anom_loo + pca$center)

anom_test <- pca$rotation[ ,1:npc] %*% t(scores_em_mean_test[,1:npc])*pca$scale
tens_test <- t(anom_test + pca$center)
par(mfrow = c(1,3))

# Leave-one-out emulated scores against the true PC scores; points on
# the 1:1 line indicate a well-performing emulator.
for(i in 1:npc){
  
  plot( pca$x[,i], scores_em_mean_train[,i])
  abline(0,1)
  
}

# Leave-one-out reconstruction error for the training set.
Y_loo_err <- tens_loo - Y_train

par(mfrow = c(1,2))

# Training ensemble (black) with LOO reconstructions overlaid (red).
matplot(years_trunc, t(Y_train), 
        type = 'l',
        lty = 'solid',
        col = 'black',
        xlab = 'years',
        ylab = 'test',
        main = 'Leave-one-out prediction error',
        ylim = c(-50,200)
        )

matlines(years_trunc, t(tens_loo), 
        type = 'l',
        lty = 'solid',
        col = 'red',
        # NOTE(review): trailing comma above leaves an empty argument;
        # R tolerates it here but it should be removed.
        )

matplot(years_trunc, t(Y_loo_err), 
        type = 'l',
        lty = 'solid',
        col = 'black',
        xlab = 'years',
        ylab = 'test - predicted output',
        main = 'test prediction error',
        ylim = c(-100,100)
        )

# Out-of-sample reconstruction error for the held-out test members.
Y_test_err <- tens_test - Y_test

par(mfrow = c(1,2))

matplot(years_trunc, t(Y_test), 
        type = 'l',
        lty = 'solid',
        col = 'black',
        xlab = 'years',
        ylab = 'test',
        main = 'Y_test',
        ylim = c(-50,200)
        )

matlines(years_trunc, t(tens_test), 
        type = 'l',
        lty = 'solid',
        col = 'red',
        )

matplot(years_trunc, t(Y_test_err), 
        type = 'l',
        lty = 'solid',
        col = 'black',
        xlab = 'years',
        ylab = 'test - predicted output',
        main = 'test prediction error',
        ylim = c(-100,100)
        )

## Is this right? How about testing how good emulating the first (or last) point is? Are the “zero carbon cycle” and “failures” causing problems with the emulator?

# Emulate just the first timepoint of the (truncated) series directly,
# as a simpler check on emulator quality.
y_sp_train <- Y_train[ ,1]
y_sp_test  <- Y_test[ ,1]

fit <- km(~., design = X_train, response = y_sp_train)
## 
## optimisation start
## ------------------
## * estimation method   : MLE 
## * optimisation method : BFGS 
## * analytical gradient : used
## * trend model : ~alpha_io + a_wl_io + bio_hum_cn + b_wl_io + dcatch_dlai_io + 
##     dqcrit_io + dz0v_dh_io + f0_io + fd_io + g_area_io + g_root_io + 
##     g_wood_io + gs_nvg_io + hw_sw_io + kaps_roth + knl_io + lai_max_io + 
##     lai_min_io + lma_io + n_inorg_turnover + nmass_io + nr_io + 
##     retran_l_io + retran_r_io + r_grow_io + rootd_ft_io + sigl_io + 
##     sorp + tleaf_of_io + tlow_io + tupp_io + l_vg_soil
## * covariance model : 
##   - type :  matern5_2 
##   - nugget : NO
##   - parameters lower bounds :  1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 
##   - parameters upper bounds :  1.998344 2 1.994471 1.999927 2 2 2 1.996699 2 1.996789 2 2 2 2 1.999374 1.99598 1.998257 2 2 1.996693 1.998361 2 2 2 1.991674 2 2 1.986556 2 2 2 1.998477 
##   - best initial criterion value(s) :  -1731.7 
## 
## N = 32, M = 5 machine precision = 2.22045e-16
## At X0, 0 variables are exactly at the bounds
## At iterate     0  f=       1731.7  |proj g|=       1.8431
## At iterate     1  f =       1725.7  |proj g|=        1.7078
## At iterate     2  f =       1715.7  |proj g|=        1.7668
## At iterate     3  f =       1711.2  |proj g|=        1.7149
## At iterate     4  f =       1699.6  |proj g|=        1.9473
## At iterate     5  f =       1698.2  |proj g|=        1.7839
## At iterate     6  f =       1694.1  |proj g|=        1.8427
## ys=-1.848e-01  -gs= 4.051e+00, BFGS update SKIPPED
## At iterate     7  f =       1676.7  |proj g|=        1.9325
## At iterate     8  f =       1665.8  |proj g|=        1.9131
## At iterate     9  f =         1660  |proj g|=        1.8914
## At iterate    10  f =       1654.5  |proj g|=        1.8321
## At iterate    11  f =       1652.6  |proj g|=        1.8544
## At iterate    12  f =         1651  |proj g|=        1.1765
## At iterate    13  f =       1649.2  |proj g|=        1.1722
## At iterate    14  f =       1648.2  |proj g|=        1.8488
## At iterate    15  f =       1646.9  |proj g|=        1.8481
## At iterate    16  f =       1642.9  |proj g|=        1.0939
## At iterate    17  f =         1639  |proj g|=        1.0351
## At iterate    18  f =       1638.5  |proj g|=        1.8282
## At iterate    19  f =       1637.4  |proj g|=        1.8254
## At iterate    20  f =       1635.6  |proj g|=        1.7282
## At iterate    21  f =       1635.2  |proj g|=        1.5999
## At iterate    22  f =       1633.5  |proj g|=        1.8229
## At iterate    23  f =         1633  |proj g|=        1.8169
## At iterate    24  f =       1631.2  |proj g|=        1.1789
## At iterate    25  f =       1630.6  |proj g|=        1.8223
## At iterate    26  f =       1630.4  |proj g|=        1.8146
## At iterate    27  f =       1630.3  |proj g|=        1.7245
## At iterate    28  f =       1630.2  |proj g|=        1.4229
## At iterate    29  f =       1629.8  |proj g|=        1.8136
## At iterate    30  f =       1629.2  |proj g|=        1.2648
## At iterate    31  f =       1629.1  |proj g|=        1.8165
## At iterate    32  f =       1628.8  |proj g|=        1.8172
## At iterate    33  f =       1628.6  |proj g|=        1.7298
## At iterate    34  f =       1628.5  |proj g|=        1.2649
## At iterate    35  f =       1628.5  |proj g|=        1.2429
## At iterate    36  f =       1628.3  |proj g|=        1.1536
## At iterate    37  f =       1628.1  |proj g|=        1.1453
## At iterate    38  f =       1628.1  |proj g|=        1.8103
## At iterate    39  f =         1628  |proj g|=        1.8124
## At iterate    40  f =       1627.9  |proj g|=       0.60535
## At iterate    41  f =       1627.9  |proj g|=        0.6034
## At iterate    42  f =       1627.9  |proj g|=       0.96437
## At iterate    43  f =       1627.9  |proj g|=       0.34309
## At iterate    44  f =       1627.9  |proj g|=       0.39325
## At iterate    45  f =       1627.9  |proj g|=       0.42079
## At iterate    46  f =       1627.9  |proj g|=       0.23201
## At iterate    47  f =       1627.9  |proj g|=       0.30153
## At iterate    48  f =       1627.9  |proj g|=       0.57704
## At iterate    49  f =       1627.9  |proj g|=       0.26205
## At iterate    50  f =       1627.9  |proj g|=       0.24962
## At iterate    51  f =       1627.9  |proj g|=       0.25086
## At iterate    52  f =       1627.9  |proj g|=       0.39192
## At iterate    53  f =       1627.9  |proj g|=        1.0372
## At iterate    54  f =       1627.9  |proj g|=       0.66916
## At iterate    55  f =       1627.9  |proj g|=       0.37207
## At iterate    56  f =       1627.9  |proj g|=       0.55944
## At iterate    57  f =       1627.9  |proj g|=       0.25217
## At iterate    58  f =       1627.9  |proj g|=       0.28757
## At iterate    59  f =       1627.9  |proj g|=       0.10602
## At iterate    60  f =       1627.9  |proj g|=       0.14743
## At iterate    61  f =       1627.9  |proj g|=      0.051709
## At iterate    62  f =       1627.9  |proj g|=       0.10027
## At iterate    63  f =       1627.9  |proj g|=       0.20563
## At iterate    64  f =       1627.9  |proj g|=       0.11686
## At iterate    65  f =       1627.9  |proj g|=       0.16813
## At iterate    66  f =       1627.9  |proj g|=       0.19414
## At iterate    67  f =       1627.9  |proj g|=      0.093691
## At iterate    68  f =       1627.9  |proj g|=      0.097095
## At iterate    69  f =       1627.9  |proj g|=       0.10811
## At iterate    70  f =       1627.9  |proj g|=      0.057755
## At iterate    71  f =       1627.9  |proj g|=       0.08492
## At iterate    72  f =       1627.9  |proj g|=      0.027138
## At iterate    73  f =       1627.9  |proj g|=      0.018426
## 
## iterations 73
## function evaluations 94
## segments explored during Cauchy searches 84
## BFGS updates skipped 1
## active bounds at final generalized Cauchy point 25
## norm of the final projected gradient 0.018426
## final function value 1627.88
## 
## F = 1627.88
## final  value 1627.877583 
## converged
# Predict the single-timepoint response at the held-out test inputs.
# Call the generic predict() rather than the predict.km method directly,
# so dispatch stays with DiceKriging.
pred <- predict(fit, newdata = X_test, type = 'UK')

# Predict NPP test set ----

par(mfrow = c(1,2))
# Predicted vs observed at the first timepoint; 1:1 line for reference.
plot(y_sp_test, pred$mean)
abline(0,1)

# Residuals by test-member index.
plot(y_sp_test - pred$mean )

twoStep_glmnet <- function(X, y, nugget=NULL, nuggetEstim=FALSE, noiseVar=NULL, seed=NULL, trace=FALSE, maxit=100,
                           REPORT=10, factr=1e7, pgtol=0.0, parinit=NULL, popsize=100){
  # Use lasso to reduce input dimension of emulator before
  # building.
  #
  # Step 1: cross-validated lasso (cv.glmnet) selects the active inputs
  # at the "lambda.1se" regularisation level.
  # Step 2: a DiceKriging km() GP is fitted with a linear trend in the
  # selected inputs only.
  #
  # X        : design matrix of inputs, with column names
  # y        : response vector, length nrow(X)
  # nugget, nuggetEstim, noiseVar, parinit : passed through to km()
  # seed     : stored in the returned list for bookkeeping only
  #            (NOTE(review): never applied via set.seed())
  # trace, maxit, REPORT, factr, pgtol, popsize : optimiser controls
  # Returns a list with the inputs, the km emulator, its covariance
  # ranges and trend coefficients, the retained trend terms, and the
  # cv.glmnet fit.
  control_list = list(trace=trace, maxit=maxit, REPORT=REPORT, factr=factr, pgtol=pgtol, pop.size=popsize)
  xvars = colnames(X)
  data = data.frame(response=y, x=X)
  colnames(data) <- c("response", xvars)
  # NOTE(review): 'data' and 'nval' appear unused below.
  nval = length(y)
  
  # fit a lasso by cross validation
  # NOTE(review): library() inside a function is a side effect on the
  # search path; prefer requireNamespace() + glmnet:: at top level.
  library(glmnet)
  fit_glmnet_cv = cv.glmnet(x=X,y=y)
  
  # The labels of the retained coefficients are here
  # (retains intercept at index zero)
  # @i holds 0-based indices of non-zero coefficients in the sparse
  # coefficient matrix, so dropping the intercept label first makes
  # labs[coef_i] line up with the retained inputs.
  coef_i = (coef(fit_glmnet_cv, s = "lambda.1se"))@i
  labs = labels(coef(fit_glmnet_cv, s = "lambda.1se"))[[1]]
  labs = labs[-1] # remove intercept
  glmnet_retained = labs[coef_i]
  
  # Build the trend formula from the retained inputs and fit the GP.
  start_form = as.formula(paste("~ ", paste(glmnet_retained , collapse= "+")))
  m = km(start_form, design=X, response=y, nugget=nugget, parinit=parinit,
         nugget.estim=nuggetEstim, noise.var=noiseVar, control=control_list)
  
  return(list(x=X, y=y, nugget=nugget, nuggetEstim=nuggetEstim,
              noiseVar=noiseVar, emulator=m, seed=seed, coefs=m@covariance@range.val,
              trends=m@trend.coef, meanTerms=all.vars(start_form), fit_glmnet_cv=fit_glmnet_cv))
}
# Fit the lasso-reduced (two-step) emulators to the first three PC scores.
twostep_test1 <- twoStep_glmnet(X = X_train, y = pca$x[,1])
## Loading required package: Matrix
## Loaded glmnet 4.1-2
twostep_test2 <- twoStep_glmnet(X = X_train, y = pca$x[,2])
twostep_test3 <- twoStep_glmnet(X = X_train, y = pca$x[,3])
# Leave-one-out predictions from the two-step PC1 emulator, for
# comparison with the full-input km fit.
twostep_loo1 <- leaveOneOut.km(model = twostep_test1$emulator, type = 'UK', trend.reestim = TRUE)
# The twostep error is a little lower. Not much though.

plot(pca$x[,1], scores_em_mean_train[,1])
points(pca$x[,1], twostep_loo1$mean, col = 'red')

abline(0,1)

# Mean absolute LOO error: full-input emulator vs two-step emulator.
mean(abs(scores_em_mean_train[,1] - pca$x[,1]))
## [1] 3.431702
mean(abs(twostep_loo1$mean - pca$x[,1]))
## [1] 3.371348

# How well do we reproduce timeseries anomaly (trend, change) data? ----

anomalizeTSmatrix = function(x, ix){
  # Anomalize a timeseries matrix: subtract each row's mean over the
  # reference columns ix from the entire row.
  #
  # x  : numeric matrix (rows = ensemble members, cols = time points)
  # ix : column indices defining the reference (baseline) period
  # Returns a matrix with the same dimensions as x.
  #
  # rowMeans() replaces apply(x[, ix], 1, mean): same numbers, faster,
  # and drop = FALSE keeps this working when ix selects a single column
  # (the original apply() call errors on a dimensionless vector there).
  sweepstats = rowMeans(x[ , ix, drop = FALSE])
  anom = sweep(x, 1, sweepstats, FUN = '-')
  anom
}

# NPP anomalies relative to the mean over columns 90:111
# (presumably the 1940-1959 baseline, per later axis labels — TODO confirm)
npp_anom_ens_wave00 <- anomalizeTSmatrix(npp_ens_wave00, 90:111)
Y_anom <- npp_anom_ens_wave00
Y_train_anom <- Y_anom[train_ix, years_ix]
Y_test_anom  <- Y_anom[test_ix, years_ix]

# PCA of the anomaly training set; columns are scaled as well as centered
pca_anom <- prcomp(Y_train_anom, center = TRUE, scale = TRUE)



par(mfrow = c(1,2))

# Left panel: the anomaly training ensemble.
# NOTE(review): the title says 'Leave-one-out prediction error' but this
# panel plots the raw training anomalies — confirm the intended title.
matplot(years_trunc, t(Y_train_anom), 
        type = 'l',
        lty = 'solid',
        col = 'black',
        xlab = 'years',
        ylab = 'test',
        main = 'Leave-one-out prediction error',
        ylim = c(-10,50)
        )

# Right panel: scree plot of the anomaly PCA
plot(pca_anom)

# How many principal components do we wish to keep? 
npc <- 4

scores_train_anom <- pca_anom$x[ ,1:npc]

# project the truncated scores back up, to see how well they reproduce the 
# original data
  
#anom <- pca$rotation[ ,1:npc] %*% t(scores)*pca$scale
#tens <- t(anom + pca$center)
library(parallel)
scores_train_anom_list <- mat2list(scores_train_anom)

# Fit a km() emulator to each retained anomaly PC score, in parallel
fit_list_scores_train_anom <- mclapply(X = scores_train_anom_list, FUN = km, formula = ~., design = X_train,
                                   mc.cores = 4, control = list(trace = FALSE))
scores_train_anom_em_loo <- matrix(ncol = ncol(scores_train_anom), nrow = nrow(scores_train_anom))

# Leave-one-out predicted mean of each PC score, one column per PC
for(i in 1:ncol(scores_train_anom_em_loo)){
  
  pred <- leaveOneOut.km(model = fit_list_scores_train_anom[[i]], type = 'UK', trend.reestim = TRUE)
  scores_train_anom_em_loo[ ,i] <- pred$mean
  
}
par(mfrow = c(2,2))

# Observed vs LOO-predicted score for each PC; 1:1 line = perfect prediction
for(i in 1:npc){

plot(scores_train_anom[,i], scores_train_anom_em_loo[,i])
  abline(0,1)
}

# Project the LOO score predictions back to timeseries space:
# rotation %*% scores, undo the column scaling, then undo the centering
anom_anom_loo <- pca_anom$rotation[ ,1:npc] %*% t(scores_train_anom_em_loo[,1:npc])*pca_anom$scale
anom_tens_loo <- t(anom_anom_loo + pca_anom$center)

#anom_test <- pca$rotation[ ,1:2] %*% t(scores_em_mean_test[,1:2])*pca$scale
#tens_test <- t(anom_test + pca$center)

# LOO reconstruction error against the training anomalies
Y_anom_loo_err <- anom_tens_loo - Y_train_anom

par(mfrow = c(1,2))

# Left panel: anomaly training ensemble (black) with LOO reconstructions (red)
matplot(years_trunc, t(Y_train_anom), 
        type = 'l',
        lty = 'solid',
        col = 'black',
        xlab = 'years',
        ylab = 'test',
        main = 'Anomaly ensemble',
        ylim = c(-10,30)
        )

matlines(years_trunc, t(anom_tens_loo), 
        type = 'l',
        lty = 'solid',
        col = 'red',
        )

# Right panel: LOO reconstruction error timeseries
matplot(years_trunc, t(Y_anom_loo_err), 
        type = 'l',
        lty = 'solid',
        col = 'black',
        xlab = 'years',
        ylab = 'test - predicted output',
        main = 'test prediction error',
        ylim = c(-15,15)
        )

# seeing sparkline pairs of ensemble members would be good
# One tiny panel per training member: observed (black) vs LOO prediction (red)
par(mfrow = c(20, 20), mar = c(0.05, 0.05, 0.05, 0.05))

# NOTE(review): 398 is hard-coded — presumably nrow(Y_train_anom); confirm
for(i in 1:398){
  
  plot(Y_train_anom[i, ], axes = FALSE, type = 'l', lty = 'solid', ylim = c(-10,30))
  lines(anom_tens_loo[i,], col = 'red')
  
}

Test-set predictions are the most important measure of emulator performance, since they show how the emulator behaves on inputs it was not trained on.

# Predict the held-out (test) anomaly PC scores with the fitted emulators
scores_test_anom_em <- NULL

for(i in 1:npc){
  
  pred <- predict.km(fit_list_scores_train_anom[[i]], newdata = X_test, type = 'UK')
  scores_test_anom_em <- cbind(scores_test_anom_em, pred$mean)
  
}
#anom_loo <- pca$rotation[ ,1:2] %*% t(scores_em_mean_train[,1:2])*pca$scale
#tens_loo <- t(anom_loo + pca$center)

# Project emulated test scores back to timeseries space.
# BUG FIX: the reconstruction previously used only PCs 1:2, although npc
# score emulators were fitted and predicted above — use all npc of them.
anom_anom <- pca_anom$rotation[ ,1:npc] %*% t(scores_test_anom_em[,1:npc])*pca_anom$scale
tens_anom_test <- t(anom_anom + pca_anom$center)
# seeing sparkline pairs of ensemble members would be good
# One tiny panel per test member: observed anomaly (black) vs prediction (red)
par(mfrow = c(10, 10), mar = c(0.1, 0.1, 0.1, 0.1))

for(i in 1:100){
  
  plot(Y_test_anom[i, ], axes = FALSE, type = 'l', lty = 'solid', ylim = c(-10,30))
  lines(tens_anom_test[i,], col = 'red')
  
}

# Key requirement for the wrapper below: it should return the training fits,
# so they can be reused for prediction at new inputs (newdata) later.

pca_km <- function(X, Y, train_ix, test_ix, npc, scale = FALSE, center = TRUE, ...){
  # Emulate a high-dimensional output Y as a function of inputs X by:
  #   1. reducing the dimension of the training output with PCA,
  #   2. fitting an independent km() Gaussian process emulator to each of
  #      the first npc principal component scores,
  #   3. projecting emulated scores back up to the output space.
  #
  # X, Y      : input / output matrices (rows are ensemble members)
  # train_ix  : row indices used to train the emulators
  # test_ix   : row indices held out for testing
  # npc       : number of principal components to retain and emulate
  # scale     : passed to prcomp(); scale output columns before PCA?
  # center    : passed to prcomp(); center output columns before PCA?
  # ...       : currently unused (reserved for future km() arguments)
  #
  # Returns a list holding the split data, the PCA, the fitted emulators,
  # leave-one-out (train) and out-of-sample (test) score predictions, the
  # reconstructed outputs, and the reconstruction errors.
  
  if (!requireNamespace("parallel", quietly = TRUE)) {
    stop("package 'parallel' is required for pca_km()", call. = FALSE)
  }
  
  # Split into training and test samples (drop = FALSE guards against a
  # single-row index collapsing the matrix to a vector)
  X_train <- X[train_ix, , drop = FALSE]
  X_test  <- X[test_ix, , drop = FALSE]
  
  Y_train <- Y[train_ix, , drop = FALSE]
  Y_test  <- Y[test_ix, , drop = FALSE]
  
  # Reduce dimension of the output
  pca <- prcomp(Y_train, scale = scale, center = center)
  
  scores_train_list <- mat2list(pca$x[, seq_len(npc), drop = FALSE])
  
  # Fitting the emulators is the slow step, so fit each PC in parallel
  fit_list <- parallel::mclapply(X = scores_train_list, FUN = km, formula = ~., design = X_train,
                                 mc.cores = 4, control = list(trace = FALSE, maxit = 200))
  
  # Preallocate score-prediction matrices rather than growing with cbind()
  n_train <- length(train_ix)
  n_test  <- length(test_ix)
  scores_em_train_mean <- matrix(NA_real_, nrow = n_train, ncol = npc)
  scores_em_train_sd   <- matrix(NA_real_, nrow = n_train, ncol = npc)
  scores_em_test_mean  <- matrix(NA_real_, nrow = n_test,  ncol = npc)
  scores_em_test_sd    <- matrix(NA_real_, nrow = n_test,  ncol = npc)
  
  for(i in seq_len(npc)){
    
    # Leave-one-out predictions of the training scores ...
    loo <- leaveOneOut.km(fit_list[[i]], type = 'UK', trend.reestim = TRUE)
    # ... and out-of-sample predictions of the test scores
    pred_test <- predict(fit_list[[i]], newdata = X_test, type = 'UK')
    
    scores_em_train_mean[, i] <- loo$mean
    scores_em_train_sd[, i]   <- loo$sd
    scores_em_test_mean[, i]  <- pred_test$mean
    scores_em_test_sd[, i]    <- pred_test$sd
    
  }
  
  # Project back up to the output dimension. When prcomp() scaled the
  # columns, pca$scale holds the scalings to undo; otherwise it is FALSE
  # and no rescaling is needed (multiply by 1). This replaces the previous
  # duplicated if/else branches.
  sc <- if (scale) pca$scale else 1
  rot <- pca$rotation[, seq_len(npc), drop = FALSE]
  
  anom_train <- rot %*% t(scores_em_train_mean) * sc
  anom_test  <- rot %*% t(scores_em_test_mean) * sc
  
  # NOTE(review): projecting the per-PC sd through the rotation ignores
  # cross-PC covariance — treat as a rough uncertainty indication only.
  sd_train <- rot %*% t(scores_em_train_sd) * sc
  sd_test  <- rot %*% t(scores_em_test_sd) * sc
  
  # Undo the centering to recover predictions on the original output scale
  Ypred_train <- t(anom_train + pca$center)
  Ypred_test  <- t(anom_test + pca$center)
  
  # Reconstruction errors (previously computed but never returned)
  Yerr_train <- Y_train - Ypred_train
  Yerr_test  <- Y_test - Ypred_test
  
  return(list(train_ix = train_ix,
              test_ix = test_ix,
              X_train = X_train,
              X_test = X_test,
              pca = pca,
              fit_list = fit_list,
              scores_em_train_mean = scores_em_train_mean,
              scores_em_train_sd = scores_em_train_sd,
              scores_em_test_mean = scores_em_test_mean,
              scores_em_test_sd = scores_em_test_sd,
              Ypred_train = Ypred_train,
              Ypred_test = Ypred_test,
              Yerr_train = Yerr_train,
              Yerr_test = Yerr_test
  ))
  
}
# How about we look at post 1950? Maybe that is messing it up?
#years_ix <- 101:164

#train_ix <- (1:399)[-288] # Number 288 has numerical problems for NPP
#test_ix  <- 400:499

# Drop member 288 (numerical problems for NPP); after the removal the rows
# renumber, so the train/test splits below are 1:398 and 399:498
X_trunc<- X[-288, ]
Y_trunc <- npp_ens_wave00[-288, years_ix ]
train_ix = 1:398
test_ix = 399:498

# PCA+km emulator of absolute NPP, keeping 3 PCs, centered but not scaled
pc_km_npp <- pca_km(X = X_trunc,
                     Y_trunc,
                     train_ix = train_ix, test_ix = test_ix, 
                     npc = 3, 
                     scale = FALSE,
                     center = TRUE)
# Observed vs LOO-predicted PC1 score; 1:1 line marks perfect prediction
plot(pc_km_npp$pca$x[,1],pc_km_npp$scores_em_train_mean[,1] )
abline(0,1)

# seeing sparkline pairs of ensemble members would be good
# One tiny panel per test member: observed (black) vs predicted (red)
par(mfrow = c(10, 10), mar = c(0.1, 0.1, 0.1, 0.1))

for(i in 1:100){
  
  plot(years_trunc, (Y_trunc[test_ix, ])[i, ], ylim = c(-50, 200), type = 'l', axes = FALSE)
  
  lines(years_trunc, pc_km_npp$Ypred_test[i, ], col = 'red')
  
}

# Recompute NPP anomalies w.r.t. the mean over columns 90:111
npp_anom_ens_wave00 <- anomalizeTSmatrix(npp_ens_wave00, 90:111)
# How about we look at post 1950? Maybe that is messing it up?
#years_ix <- 101:164

#train_ix <- (1:399)[-288] # Number 288 has numerical problems for NPP
#test_ix  <- 400:499

# Drop member 288 (numerical problems for NPP); rows renumber after removal
X_trunc<- X[-288, ]
Y_trunc <- npp_anom_ens_wave00[-288, years_ix ]
train_ix = 1:398
test_ix = 399:498

# PCA+km emulator of the NPP anomaly, keeping 5 PCs, scaled and centered
pc_km_npp_anom <- pca_km(X = X_trunc,
                      Y_trunc,
                       train_ix = train_ix, 
                      test_ix = test_ix, 
                       npc = 5, 
                      scale = TRUE,
                      center = TRUE)
# seeing sparkline pairs of ensemble members would be good
# One tiny panel per test member: observed (black) vs predicted (red)
par(mfrow = c(10, 10), mar = c(0.05, 0.05, 0.05, 0.05), oma = c(3,0.1,3,0.1))

for(i in 1:100){
  
  plot(years_trunc, (Y_trunc[test_ix, ])[i, ], ylim = c(-5, 20), type = 'l', axes = FALSE)
  lines(years_trunc, pc_km_npp_anom$Ypred_test[i, ], col = 'red')
  
}

# reset() restores the plotting region so we can annotate the whole device
reset()

mtext(text = 'NPP anomaly test set predictions', side = 3, cex = 1.5, line = -2, outer = TRUE)
legend('bottom', col = c('black', 'red'), legend = c('observed', 'predicted'), horiz = TRUE, lty = 'solid')

par(mfrow = c(2,1))


# Top panel: training members, observed vs emulated
matplot(years_trunc, t(Y_trunc[train_ix, ]), type = 'l', lty = 'solid', col = 'black',
        main = "Train",
        ylab = 'NPP anomaly from 1940 - 1959 mean')
matlines(years_trunc, t(pc_km_npp_anom$Ypred_train), lty = 'solid', col = 'red')

legend('topleft', col = c('black', 'red'), legend = c('Observed', 'Predicted'), lty = 'solid')


# Bottom panel: test members, observed vs emulated
matplot(years_trunc, t(Y_trunc[test_ix, ]), type = 'l', lty = 'solid', col = 'black',
        main = 'Test',
        ylab = 'NPP anomaly from 1940 - 1959 mean')
matlines(years_trunc, t(pc_km_npp_anom$Ypred_test), lty = 'solid', col = 'red')
legend('topleft', col = c('black', 'red'), legend = c('Observed', 'Predicted'), lty = 'solid')

How do things change if we exclude “zero carbon cycle” ensemble members?

McNeall et al. (2023) found that the modelled carbon cycle tends to collapse when f0 > 0.9 or b_wl < 0.15. If those thresholds are correct, excluding the ensemble members beyond them should remove the problematic "zero carbon cycle" runs.

# Excise ensemble members we know to cause problems: keep only members
# inside the f0 / b_wl limits where the carbon cycle survives
keep_ix <- which(X[, 'f0_io'] < 0.9 & X[, 'b_wl_io'] > 0.15 )
# Also drop member 288 (numerical problems for NPP).
# BUG FIX: keep_ix[-which(keep_ix == 288)] returns an EMPTY vector when 288
# is absent, because negative indexing with integer(0) selects nothing;
# setdiff() is safe in both cases.
keep_ix <- setdiff(keep_ix, 288)

X_trunc <- X[keep_ix, ]
Y_trunc <- npp_anom_ens_wave00[keep_ix, years_ix ]
train_ix <- 1:300
test_ix  <- 301:374

# Refit the anomaly emulator on the cleaned ensemble (5 PCs, scaled, centered)
pc_km_npp_anom <- pca_km(X = X_trunc,
                      Y_trunc,
                       train_ix = train_ix, 
                      test_ix = test_ix, 
                       npc = 5, 
                      scale = TRUE,
                      center = TRUE)
par(mfrow = c(2,1))


# Top panel: training members, observed vs emulated
matplot(years_trunc, t(Y_trunc[train_ix, ]), type = 'l', lty = 'solid', col = 'black',
        main = "Train",
        ylab = 'NPP anomaly from 1940 - 1959 mean')
matlines(years_trunc, t(pc_km_npp_anom$Ypred_train), lty = 'solid', col = 'red')

legend('topleft', col = c('black', 'red'), legend = c('Observed', 'Predicted'), lty = 'solid')


# Bottom panel: test members, observed vs emulated
matplot(years_trunc, t(Y_trunc[test_ix, ]), type = 'l', lty = 'solid', col = 'black',
        main = 'Test',
        ylab = 'NPP anomaly from 1940 - 1959 mean')
matlines(years_trunc, t(pc_km_npp_anom$Ypred_test), lty = 'solid', col = 'red')
legend('topleft', col = c('black', 'red'), legend = c('Observed', 'Predicted'), lty = 'solid')

# seeing sparkline pairs of ensemble members would be good
# One tiny panel per test member: observed (black) vs predicted (red)
par(mfrow = c(10, 10), mar = c(0.05, 0.05, 0.05, 0.05), oma = c(3,0.1,3,0.1))

for(i in 1:length(test_ix)){
  
  plot(years_trunc, (Y_trunc[test_ix, ])[i, ], ylim = c(-5, 20), type = 'l', axes = FALSE)
  lines(years_trunc, pc_km_npp_anom$Ypred_test[i, ], col = 'red')
  
}

# reset() restores the plotting region so we can annotate the whole device
reset()

mtext(text = 'NPP anomaly test set predictions', side = 3, cex = 1.5, line = -2, outer = TRUE)
legend('bottom', col = c('black', 'red'), legend = c('observed', 'predicted'), horiz = TRUE, lty = 'solid')

# Excise ensemble members we know to cause problems: keep only members
# inside the f0 / b_wl limits where the carbon cycle survives
keep_ix <- which(X[, 'f0_io'] < 0.9 & X[, 'b_wl_io'] > 0.15 )
# Also drop member 288 (numerical problems for NPP).
# BUG FIX: keep_ix[-which(keep_ix == 288)] returns an EMPTY vector when 288
# is absent, because negative indexing with integer(0) selects nothing;
# setdiff() is safe in both cases.
keep_ix <- setdiff(keep_ix, 288)

X_trunc <- X[keep_ix, ]
Y_trunc <- npp_ens_wave00[keep_ix, years_ix ]
train_ix <- 1:300
test_ix  <- 301:374

# Refit the absolute-NPP emulator on the cleaned ensemble (3 PCs, scaled)
pc_km_npp  <- pca_km(X = X_trunc,
                      Y_trunc,
                       train_ix = train_ix, 
                      test_ix = test_ix, 
                       npc = 3, 
                      scale = TRUE,
                      center = TRUE)
par(mfrow = c(2,1))


# Top panel: training members, observed vs emulated
matplot(years_trunc, t(Y_trunc[train_ix, ]), type = 'l', lty = 'solid', col = 'black',
        main = "Train",
        ylab = 'NPP from 1940 - 1959 mean')
matlines(years_trunc, t(pc_km_npp$Ypred_train), lty = 'solid', col = 'red')

legend('topleft', col = c('black', 'red'), legend = c('Observed', 'Predicted'), lty = 'solid')


# Bottom panel: test members, observed vs emulated
matplot(years_trunc, t(Y_trunc[test_ix, ]), type = 'l', lty = 'solid', col = 'black',
        main = 'Test',
        ylab = 'NPP from 1940 - 1959 mean')
matlines(years_trunc, t(pc_km_npp$Ypred_test), lty = 'solid', col = 'red')
legend('topleft', col = c('black', 'red'), legend = c('Observed', 'Predicted'), lty = 'solid')

# seeing sparkline pairs of ensemble members would be good
# One tiny panel per test member: observed (black) vs predicted (red)
par(mfrow = c(10, 10), mar = c(0,0,0,0), oma = c(3,0.1,3,0.1))

for(i in 1:length(test_ix)){
  
  plot(years_trunc, (Y_trunc[test_ix, ])[i, ], ylim = c(0, 120), type = 'l', xaxt = 'n', yaxt = 'n')
  lines(years_trunc, pc_km_npp$Ypred_test[i, ], col = 'red')
  
}

# reset() restores the plotting region so we can annotate the whole device
reset()

mtext(text = 'NPP test set predictions', side = 3, cex = 1.5, line = -2, outer = TRUE)
legend('bottom', col = c('black', 'red'), legend = c('observed', 'predicted'), horiz = TRUE, lty = 'solid')

# Scatter of all observed vs predicted test values (matrices flattened
# elementwise by plot's coordinate handling); 1:1 line = perfect prediction
plot(Y_trunc[test_ix, ], pc_km_npp$Ypred_test, xlim = c(0,120), ylim = c(0,120))
abline(0,1)

Test with cumulative NBP

# Cumulative NBP: running sum along each ensemble member's timeseries
# (apply over rows returns columns, so transpose back to rows = members)
cnbp_ens_wave00 <-  t(apply(nbp_ens_wave00, 1, FUN = cumsum))


# Plot all members except 288 (the member with known numerical problems)
matplot(years, t(cnbp_ens_wave00[-288,]), col = 'black', type = 'l', lty = 'solid')